// cd /projects/data_commons/cw_code/
// qstata cw_ind_3_fk_naics12_imp.do &

// Root locations used throughout: project directory, crosswalk folder, LBD folder.
global dir_proj "/projects/data_commons/"
global dir_cw   "${dir_proj}/cw/"
global dir_lbd  "${dir_proj}/lbd/"

display "Started at $S_DATE $S_TIME"

// Reserve a temporary file name and publish it as a global; the intermediate
// datasets saved later are named by appending a suffix to this name.
tempfile ds_temp
global ds_temp "`ds_temp'"

/*==============================================================================
Determine which ch_ind to combine to avoid losing ch_ind when using fk_naics12.
Then add additional column ch_ind12 to selected files.
*/

/*------------------------------------------------------------------------------
Get a list of ch_ind that "disappear"
*/

// Distinct ch_ind values present in the fk_naics12-based crosswalk.
use ch_ind using "${dir_cw}/cw_ind_fk_imp_fk12.dta", clear
duplicates drop
save ${ds_temp}_ind, replace

// Distinct ch_ind values in the fk_naics02/07-based crosswalk, compared 1:1
// against the list above; m_ind records which side(s) each ch_ind came from.
use ch_ind using "${dir_cw}/cw_ind_fk_imp.dta", clear
duplicates drop
merge 1:1 ch_ind using ${ds_temp}_ind, generate(m_ind)
save ${ds_temp}_ind, replace

// Copy of the fk_naics12-based crosswalk with ch_ind renamed to ch_ind12,
// so it can be merged next to the 02/07-based ch_ind later on.
use "${dir_cw}/cw_ind_fk_imp_fk12.dta", clear
rename ch_ind ch_ind12
save ${ds_temp}_fk12ind, replace

/*
m_ind:
1: only in the manually created crosswalk based on fk_naics02/07 (these are the ones we need to deal with)
2: only in the automatically generated crosswalk based on fk_naics12 (just the missing value .)
3: matched
We would like to combine each ch_ind in 1 with a similar ch_ind in 3 so that no ch_ind disappears.
*/

// Show the ch_ind values that exist only in the fk_naics02/07-based crosswalk (m_ind == 1).
use ${ds_temp}_ind, clear
list ch_ind m_ind if m_ind == 1

/*------------------------------------------------------------------------------
For establishments whose ch_ind07 does not exist among the ch_ind12 values, look at which ch_ind12 they are mapped to,
and reassign ch_ind07 based on the rules specified below.
*/

use "${dir_lbd}/lbd_raw_sum_fk_comp.dta", clear

/*
Note that until the last step, ch_indXX denotes the ch_ind mapped with fkXX.
In the last step, ch_ind12 denotes the improved ch_ind.
*/
foreach iver in 02 07 {
	// The crosswalk is keyed on fk_naics, so copy this vintage's code into that name.
	gen fk_naics = fk`iver'
	merge m:1 fk_naics using ${dir_cw}/cw_ind_fk_imp, keep(match master) gen(m_fk`iver')
	// Diagnostic: codes that found no crosswalk entry. ch_ind is missing on
	// exactly these rows, so a two-way tab against ch_ind would always be empty;
	// tabulate the code alone and include missing codes in the count.
	tab fk_naics if m_fk`iver' != 3, missing
	drop fk_naics
	// Free the name ch_ind for the next vintage's merge.
	rename ch_ind ch_ind`iver'
}

/* REDACTED
tabbing year for certain ch_ind instances */

// Total employment for every year x fk12 x ch_ind07 cell.
collapse (sum) emp, by(year fk12 ch_ind07)

// Attach ch_ind12 from the fk12-based crosswalk copy, which is keyed on fk_naics.
generate fk_naics = fk12
merge m:1 fk_naics using ${ds_temp}_fk12ind, generate(m_fk12)

// fk12 codes left without a ch_ind12.
tabulate fk12 if ch_ind12 == .
/* REDACTED 
Summarizing year for certain fk12 and ch_ind12 instances */

/* 
We did not assign a fk12 to them because they no longer appear after 2008.
Do we need to deal with them? At a minimum, look at which fk12 the establishments with fk07 = 525910 or 525990 are mapped to.
*/

// Keep only the rows whose ch_ind07 is one of the "disappearing" codes
// (m_ind == 1 in the comparison list built earlier).
gen ch_ind = ch_ind07
merge m:1 ch_ind using ${ds_temp}_ind
keep if m_ind == 1

keep if inlist(year,2014) // Look at year 2014 (since fk_naics12-ch_ind currently uses only 2014)
collapse (sum) emp, by(ch_ind07 ch_ind12)

// Share of each ch_ind07's employment that lands in each ch_ind12.
// Stored as double so the comparison with 0.98 is not affected by float rounding.
by ch_ind07: egen double emp_tot = total(emp)
gen double emps = emp / emp_tot
by ch_ind07: egen double emps_max = max(emps)
/*
If more than 98% employment in the ch_ind07 are mapped to the same ch_ind12, we combine the ch_ind07 with that ch_ind12
*/
// Missing values compare greater than any number, so a group with zero total
// employment (emps missing) would otherwise pass "emps > 0.98" on every row.
gen double ch_ind07_temp = ch_ind12 if emps == emps_max & emps > 0.98 & !missing(emps)
/*
Else we deal individually
*/
// Also surface groups whose share could not be computed (emps_max missing).
tab ch_ind07 if emps_max <= 0.98 | missing(emps_max)

/* REDACTED 
Manual updates are made to update certain ch_ind instances */ 

// Copy the assigned code to every row of its ch_ind07 group
// (group mean of the non-missing ch_ind07_temp values).
by ch_ind07: egen double ch_ind07_new = mean(ch_ind07_temp)

// Reduce to the ch_ind07 -> new code pairs and adopt the variable names
// expected when this mapping is merged onto the crosswalks.
keep ch_ind07 ch_ind07_new
rename (ch_ind07 ch_ind07_new) (ch_ind ch_ind12)
duplicates drop
save ${ds_temp}_new, replace

/*------------------------------------------------------------------------------
Double check that we have dealt with the disappearing ch_ind
*/

// Put the new mapping next to the comparison list and print the
// disappearing codes (m_ind == 1) together with their assigned ch_ind12.
use ${ds_temp}_ind, clear
merge 1:1 ch_ind using ${ds_temp}_new

list ch_ind m_ind ch_ind12 _merge if m_ind == 1
/* REDACTED 
A comment on a specific ch_ind issue is redacted to abide by Census disclosure guidelines */


/*------------------------------------------------------------------------------
Add the additional columns to selected crosswalks.
*/

capture program drop p_ch_ind12
program define p_ch_ind12
	// Add column ch_ind12 to the crosswalk currently in memory.
	//
	// Requires: variable ch_ind in memory; global ds_temp set so that
	//           "${ds_temp}_new" is the saved ch_ind -> ch_ind12 mapping.
	// Result:   ch_ind12 holds the mapped code where the mapping has one,
	//           and ch_ind itself everywhere else. Rows that changed are listed.

	// Start clean if the crosswalk already carries a ch_ind12 column.
	capture drop ch_ind12

	// keep(master match): never append mapping rows whose ch_ind is absent
	// from this crosswalk (they would have a missing crosswalk key).
	// nogenerate: no _merge variable is created, so nothing to drop afterwards.
	merge m:1 ch_ind using "${ds_temp}_new", keep(master match) nogenerate

	// Codes without a reassignment keep their original value.
	replace ch_ind12 = ch_ind if missing(ch_ind12)

	// Show the rows whose code was changed.
	list if ch_ind != ch_ind12
end

cd "${dir_cw}/"

// For each crosswalk: load it, add ch_ind12, sort by the crosswalk key,
// and save under a new name in Stata 12 format.
// Note: the documented option for replacing the data in memory with -use- is "clear".

// fk_naics crosswalk (also exported as CSV)
use cw_ind_fk_imp, clear
p_ch_ind12
sort fk_naics
saveold cw_ind_fk_imp2, replace v(12)
export delimited cw_ind_fk_imp2.csv, replace


// naics crosswalk
use cw_ind_naics_imp, clear
p_ch_ind12
sort naics
saveold "cw_ind_naics_imp2", replace v(12)


// sic crosswalk
use cw_ind_sic_imp, clear
p_ch_ind12
sort sic
saveold "cw_ind_sic_imp2", replace v(12)


// The intermediate datasets were saved under names derived from the tempfile
// name, so Stata does not remove them automatically. Best-effort cleanup;
// both spellings are tried in case save appended the .dta extension.
foreach sfx in ind fk12ind new {
	capture erase "${ds_temp}_`sfx'"
	capture erase "${ds_temp}_`sfx'.dta"
}

di "Ended at $S_DATE $S_TIME"
// End of do file
